library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:IRanges':
## 
##     trim
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select()       masks AnnotationDbi::select()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)

# Choose the workflow root (`wd`) and the `reticulate` environment path (`env`)
# for the machine we are running on. A working directory that contains
# "Bio_SDD" selects the Bio_SDD layout; anything else uses the home layout.
if (!str_detect(getwd(), "Bio_SDD")) {
    wd <- "/home/shannc/workflow"
    env <- "/home/shannc/anaconda3/envs/reticulate"
} else {
    wd <- "/home/shannc/Bio_SDD/MUIC_senior_project/workflow"
    env <- "/home/shannc/Bio_SDD/miniconda3/envs/reticulate"
}

# Load the GO term table from its on-disk cache, building the cache first if
# it does not exist yet.
go_file <- glue("{wd}/data/reference/go_data.tsv")
if (!file.exists(go_file)) {
    # Cache miss: collect every GO ID known to GO.db and look up its details.
    # NOTE(review): goInfoTb() is not defined in the visible part of this
    # file -- confirm it is available before this point is reached.
    all_gos <- names(as.list(GOTERM))
    info_tb <- goInfoTb(all_gos)
    write_tsv(info_tb, go_file)
} else {
    info_tb <- read_tsv(go_file)
}
## Rows: 42443 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): GO_IDs, term, definition, ontology
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#' Count how often each word occurs in the `term` column of a table
#'
#' @param tb A data frame with a character column named `term`.
#' @param core_word Optional pattern. When not `NULL`, only rows whose `term`
#'   matches it are kept before counting. Matching uses `grepl()`, so the
#'   value is interpreted as a regular expression.
#' @param filter_unwanted If `TRUE`, drop the words listed in `unwanted` from
#'   the result.
#' @param unwanted Character vector of words to drop. Defaults to the global
#'   `UNWANTED`; the default is evaluated lazily, so `UNWANTED` only has to
#'   exist when `filter_unwanted` is `TRUE` and no value is supplied.
#' @return A table with one row per word and columns `word` and `n`, sorted
#'   by decreasing `n`.
showFrequent <- function(tb, core_word = NULL, filter_unwanted = TRUE,
                         unwanted = UNWANTED) {
    if (!is.null(core_word)) {
        tb <- tb %>%
            filter(grepl(core_word, term))
    }
    # Split every term into single-word tokens and tally them.
    word_counts <- tb %>%
        unnest_tokens(word, term) %>%
        count(word, sort = TRUE)
    if (!filter_unwanted) {
        return(word_counts)
    }
    word_counts %>% filter(!word %in% unwanted)
}

Find the common verbs and qualifiers used in GO terms.

# Nouns
# Words that showFrequent() removes from its word counts: function words
# ("of", "to", "in") plus generic vocabulary.
UNWANTED <- c(
    "of",
    "to",
    "cell",
    "in",
    "complex",
    "activity",
    "regulation",
    "process",
    "cellular",
    "stimulus",
    "response"
)

# Qualifier words.
qualifiers <- c(
    "positive",
    "negative",
    "catabolic",
    "involved"
)